The data flow for this project requires several files:

  1. getting a list of doi, for which there is doi_extract.py which runs on a wikipedia data dump using the mediawiki-utilities package by halfak
  2. getting the page views. for each page in th doi_list.txt we need to get the page views. I have done this in a janky way, but it is outlined in the Get pageviews ipython notebook.

In [1]:
import json
import re
import operator
from collections import defaultdict
import pandas as pd

In [2]:
ls


cite doi analysis.ipynb     page_views_all.json
doi_list.txt                page_views_errors.json
doi_page_titles.json        page_views.json
doi_page_views.json         page_views.json~
dumpfile.html               README.md
Finding DOIs example.ipynb  research questions.txt
Get Pageviews.ipynb         testfile.compressed.1000meg
LICENSE

In [3]:
doi_list = open('doi_list.txt')

In [4]:
doi_lines = doi_list.readlines()

In [5]:
len(doi_lines)


Out[5]:
27182

In [6]:
page_dois = defaultdict(list)
doi_pages = defaultdict(list)
prefixes = defaultdict(int)

In [7]:
for line in doi_lines:
    parts = re.split(r'\t|\n', line)
    page_title = parts[0]
    doi = parts[1].strip()
    #if len(doi) != len(parts[1]):
    #    print(parts[1], doi)
    if doi and (doi.lower() != 'noedit'):
        page_dois[page_title].append(doi)
        doi_pages[doi].append(page_title)
        prefix = doi.split('/')[0]
        prefixes[prefix] += 1

In [8]:
for doi, pages in doi_pages.iteritems():
    if doi.startswith(' '):
        print(doi, doi.strip())

In [9]:
num_page_dois = {page: len(dois) for page, dois in page_dois.iteritems()}
num_doi_pages = {doi: len(pages) for doi, pages in doi_pages.iteritems()}

In [10]:
npd = pd.DataFrame.from_dict(data=num_page_dois, orient='index')
ndp = pd.DataFrame.from_dict(data=num_doi_pages, orient='index')
prefixdf = pd.DataFrame.from_dict(data=prefixes, orient='index')

In [11]:
npdc = npd.convert_objects(convert_numeric=True)
ndpc = ndp.convert_objects(convert_numeric=True)
prefixdfc = prefixdf.convert_objects(convert_numeric=True)

In [30]:
npdc.sort([0], ascending=False).head(20)


Out[30]:
0
Induced stem cells 189
List of Ig Nobel Prize winners 91
Asymmetric hydrogenation 86
Spinal muscular atrophy 84
Crystallographic defects in diamond 80
Fluorine 78
Fullerene chemistry 54
Choosing Wisely 50
Woolly mammoth 47
Health effects of tobacco 43
Management of schizophrenia 40
2-Norbornyl cation 39
Wolff–Kishner reduction 39
Hunterian Society 37
Nanogenerator 37
Fluid queue 36
American White Ibis 35
Phenols 33
Moroccan genetics 32
Assisted colonization 32

20 rows × 1 columns


In [12]:
print npdc.sort([0], ascending=False).head(10).to_html(justify='left')


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: left;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Induced stem cells</th>
      <td> 189</td>
    </tr>
    <tr>
      <th>List of Ig Nobel Prize winners</th>
      <td>  91</td>
    </tr>
    <tr>
      <th>Asymmetric hydrogenation</th>
      <td>  86</td>
    </tr>
    <tr>
      <th>Spinal muscular atrophy</th>
      <td>  84</td>
    </tr>
    <tr>
      <th>Crystallographic defects in diamond</th>
      <td>  80</td>
    </tr>
    <tr>
      <th>Fluorine</th>
      <td>  78</td>
    </tr>
    <tr>
      <th>Fullerene chemistry</th>
      <td>  54</td>
    </tr>
    <tr>
      <th>Choosing Wisely</th>
      <td>  50</td>
    </tr>
    <tr>
      <th>Woolly mammoth</th>
      <td>  47</td>
    </tr>
    <tr>
      <th>Health effects of tobacco</th>
      <td>  43</td>
    </tr>
  </tbody>
</table>

In [13]:
print ndpc.sort([0], ascending=False).head(11).to_html(justify='left')


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: left;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10.1128/MCB.22.19.6663-6668.2002</th>
      <td> 171</td>
    </tr>
    <tr>
      <th>10.1093/icb/icr006</th>
      <td> 132</td>
    </tr>
    <tr>
      <th>10.1093/nar/gkj002</th>
      <td>  75</td>
    </tr>
    <tr>
      <th>10.1128/MCB.24.13.5797-5807.2004</th>
      <td>  66</td>
    </tr>
    <tr>
      <th>10.1088.2F0004-6256.2F141.2F5.2F170</th>
      <td>  64</td>
    </tr>
    <tr>
      <th>10.1093/emboj/20.11.2943</th>
      <td>  50</td>
    </tr>
    <tr>
      <th>10.1088/0004-637X/753/2/156</th>
      <td>  38</td>
    </tr>
    <tr>
      <th>10.1088/0067-0049/197/2/19</th>
      <td>  33</td>
    </tr>
    <tr>
      <th>accessdate = 2012-10-25</th>
      <td>  30</td>
    </tr>
    <tr>
      <th>10.3897/zookeys.242.3856</th>
      <td>  28</td>
    </tr>
    <tr>
      <th>10.1073/pnas.242603899</th>
      <td>  27</td>
    </tr>
  </tbody>
</table>

In [14]:
print ndpc.sort([0], ascending=False).head(11).index


Index([u'10.1128/MCB.22.19.6663-6668.2002', u'10.1093/icb/icr006', u'10.1093/nar/gkj002', u'10.1128/MCB.24.13.5797-5807.2004', u'10.1088.2F0004-6256.2F141.2F5.2F170', u'10.1093/emboj/20.11.2943', u'10.1088/0004-637X/753/2/156', u'10.1088/0067-0049/197/2/19', u'accessdate = 2012-10-25', u'10.3897/zookeys.242.3856', u'10.1073/pnas.242603899'], dtype='object')

In [15]:
print prefixdf.sort([0], ascending=False).head(10).to_html()


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10.1016</th>
      <td> 3398</td>
    </tr>
    <tr>
      <th>10.1038</th>
      <td> 1879</td>
    </tr>
    <tr>
      <th>10.1007</th>
      <td> 1793</td>
    </tr>
    <tr>
      <th>10.1098</th>
      <td> 1716</td>
    </tr>
    <tr>
      <th>10.1111</th>
      <td> 1350</td>
    </tr>
    <tr>
      <th>10.1093</th>
      <td> 1203</td>
    </tr>
    <tr>
      <th>10.1002</th>
      <td>  960</td>
    </tr>
    <tr>
      <th>10.1021</th>
      <td>  873</td>
    </tr>
    <tr>
      <th>10.1126</th>
      <td>  821</td>
    </tr>
    <tr>
      <th>10.1080</th>
      <td>  704</td>
    </tr>
  </tbody>
</table>

In [16]:
page_views_list = json.load(open('page_views_all.json', 'r'))
page_views = {i[0]: i[1] for i in page_views_list}

def total_page_views(page_list):
    view_count = 0
    for page in page_list:
        try:
            view_count += page_views[page.decode('utf-8')]
        except:
            pass
            #print(page)
    return view_count

doi_views = {doi: total_page_views(page_list) for doi, page_list in doi_pages.iteritems() }
viewsdf = pd.DataFrame.from_dict(data=doi_views, orient='index')

In [18]:
print viewsdf.sort([0], ascending=False).head(10).to_html()


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10.1038/460787a</th>
      <td> 2767151</td>
    </tr>
    <tr>
      <th>10.1038/462545a</th>
      <td> 2766979</td>
    </tr>
    <tr>
      <th>10.1353/cwh.1969.0065</th>
      <td> 1382286</td>
    </tr>
    <tr>
      <th>10.1145/1284621.1284635</th>
      <td> 1313264</td>
    </tr>
    <tr>
      <th>10.1371/journal.pone.0028705</th>
      <td> 1216279</td>
    </tr>
    <tr>
      <th>10.1126/science.1173983</th>
      <td>  704326</td>
    </tr>
    <tr>
      <th>10.1038/nature06949</th>
      <td>  646251</td>
    </tr>
    <tr>
      <th>10.1073/pnas.0805721105</th>
      <td>  646251</td>
    </tr>
    <tr>
      <th>10.1080/10807030802387556</th>
      <td>  624108</td>
    </tr>
    <tr>
      <th>10.1098/rsbm.1955.0005</th>
      <td>  544353</td>
    </tr>
  </tbody>
</table>

In [23]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [29]:
npd.describe()


Out[29]:
0
count 11101.000000
mean 2.319791
std 3.955982
min 1.000000
25% 1.000000
50% 1.000000
75% 2.000000
max 189.000000

8 rows × 1 columns


In [42]:
len(npdc[npdc[0] == 1]) / float(len(npdc))


Out[42]:
0.605981443113233

In [35]:
p = npd.hist(bins=40, log=True)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-35-ff2de86e1116> in <module>()
      1 p = npd.hist(bins=40, log=True)
----> 2 p.title('DOI citations among Wikipedia articles')

AttributeError: 'numpy.ndarray' object has no attribute 'title'

In [ ]: